1 Loading libraries

library(tidyverse) # the usual stuff: dplyr, readr, and other goodies
library(lubridate) # to handle dates
library(GGally) # for correlation-scatter plot matrix
library(ggfortify) # to produce residual diagnostic plots
library(rsample) # to split dataframe in training- & testing sets
library(janitor) # clean_names()
library(broom) # use broom:augment() to get tidy table with regression output, residuals, etc
library(huxtable) # to get summary table of all models produced
library(kableExtra) # for formatting tables
library(moderndive) # for getting regression tables
library(skimr) # for skim
library(mosaic)
library(leaflet) # for interactive HTML maps
library(tidytext)
library(viridis)
library(vroom)
library(lmtest)
library(sandwich)
library(ggbeeswarm)
library(scales)
library(ggcorrplot)
library(ggthemes)

2 Downloading the data set

# use cache=TRUE so you dont donwload the data everytime you knit

listings <- vroom("http://data.insideairbnb.com/china/hk/hong-kong/2021-09-24/data/listings.csv.gz") %>% 
       clean_names()

3 Exploratory Data Analysis (EDA)

3.1 Step 1: Looking at the raw values

# Obtain an overview of the raw values
dplyr::glimpse(listings)
Rows: 6,046
Columns: 74
$ id                                           <dbl> 17891, 69074, 103760, 104~
$ listing_url                                  <chr> "https://www.airbnb.com/r~
$ scrape_id                                    <dbl> 2.021092e+13, 2.021092e+1~
$ last_scraped                                 <date> 2021-09-25, 2021-09-25, ~
$ name                                         <chr> "Large Light Filled Loft"~
$ description                                  <chr> "Gorgeous and spacious lo~
$ neighborhood_overview                        <chr> "Best neighborhood in Hon~
$ picture_url                                  <chr> "https://a0.muscache.com/~
$ host_id                                      <dbl> 69063, 160139, 304876, 54~
$ host_url                                     <chr> "https://www.airbnb.com/u~
$ host_name                                    <chr> "Candace", "Amy", "Brend"~
$ host_since                                   <date> 2010-01-09, 2010-07-07, ~
$ host_location                                <chr> "Los Angeles, California,~
$ host_about                                   <chr> "Hi, my name is Candace C~
$ host_response_time                           <chr> "within a day", "within a~
$ host_response_rate                           <chr> "100%", "100%", "100%", "~
$ host_acceptance_rate                         <chr> "0%", "75%", "89%", "N/A"~
$ host_is_superhost                            <lgl> FALSE, FALSE, FALSE, FALS~
$ host_thumbnail_url                           <chr> "https://a0.muscache.com/~
$ host_picture_url                             <chr> "https://a0.muscache.com/~
$ host_neighbourhood                           <chr> "Sheung Wan", "Sheung Wan~
$ host_listings_count                          <dbl> 1, 2, 9, 0, 9, 9, 9, 1, 1~
$ host_total_listings_count                    <dbl> 1, 2, 9, 0, 9, 9, 9, 1, 1~
$ host_verifications                           <chr> "['email', 'phone', 'revi~
$ host_has_profile_pic                         <lgl> TRUE, TRUE, TRUE, TRUE, T~
$ host_identity_verified                       <lgl> TRUE, TRUE, TRUE, TRUE, T~
$ neighbourhood                                <chr> "Hong Kong Island, Hong K~
$ neighbourhood_cleansed                       <chr> "Central & Western", "Cen~
$ neighbourhood_group_cleansed                 <lgl> NA, NA, NA, NA, NA, NA, N~
$ latitude                                     <dbl> 22.28327, 22.28350, 22.28~
$ longitude                                    <dbl> 114.1499, 114.1485, 114.1~
$ property_type                                <chr> "Entire rental unit", "En~
$ room_type                                    <chr> "Entire home/apt", "Entir~
$ accommodates                                 <dbl> 3, 3, 6, 2, 6, 6, 6, 2, 4~
$ bathrooms                                    <lgl> NA, NA, NA, NA, NA, NA, N~
$ bathrooms_text                               <chr> "1 bath", "1 bath", "1 ba~
$ bedrooms                                     <dbl> NA, 1, 2, 1, 2, 2, 2, 1, ~
$ beds                                         <dbl> 1, 2, 3, 1, 3, 3, 3, 1, 3~
$ amenities                                    <chr> "[\"Washer\", \"Air condi~
$ price                                        <chr> "$1,400.00", "$1,429.00",~
$ minimum_nights                               <dbl> 2, 2, 2, 1, 2, 2, 2, 1, 7~
$ maximum_nights                               <dbl> 365, 365, 365, 365, 365, ~
$ minimum_minimum_nights                       <dbl> 2, 2, 2, 1, 2, 2, 2, 1, 7~
$ maximum_minimum_nights                       <dbl> 2, 2, 2, 1, 2, 2, 2, 2, 7~
$ minimum_maximum_nights                       <dbl> 365, 365, 365, 365, 365, ~
$ maximum_maximum_nights                       <dbl> 365, 365, 365, 365, 365, ~
$ minimum_nights_avg_ntm                       <dbl> 2.0, 2.0, 2.0, 1.0, 2.0, ~
$ maximum_nights_avg_ntm                       <dbl> 365.0, 365.0, 365.0, 365.~
$ calendar_updated                             <lgl> NA, NA, NA, NA, NA, NA, N~
$ has_availability                             <lgl> TRUE, TRUE, TRUE, TRUE, T~
$ availability_30                              <dbl> 13, 0, 27, 30, 0, 0, 26, ~
$ availability_60                              <dbl> 43, 0, 57, 60, 0, 0, 56, ~
$ availability_90                              <dbl> 73, 13, 87, 90, 0, 23, 86~
$ availability_365                             <dbl> 318, 103, 252, 365, 116, ~
$ calendar_last_scraped                        <date> 2021-09-25, 2021-09-25, ~
$ number_of_reviews                            <dbl> 73, 135, 274, 14, 209, 22~
$ number_of_reviews_ltm                        <dbl> 0, 1, 2, 0, 4, 2, 3, 2, 0~
$ number_of_reviews_l30d                       <dbl> 0, 0, 2, 0, 0, 0, 1, 0, 0~
$ first_review                                 <date> 2016-03-10, 2012-06-29, ~
$ last_review                                  <date> 2017-11-29, 2019-08-03, ~
$ review_scores_rating                         <dbl> 4.76, 4.84, 4.44, 4.67, 4~
$ review_scores_accuracy                       <dbl> 4.73, 4.81, 4.39, 4.50, 4~
$ review_scores_cleanliness                    <dbl> 4.51, 4.77, 4.44, 4.86, 4~
$ review_scores_checkin                        <dbl> 4.92, 4.87, 4.46, 4.86, 4~
$ review_scores_communication                  <dbl> 4.93, 4.91, 4.60, 4.93, 4~
$ review_scores_location                       <dbl> 4.90, 4.90, 4.72, 4.79, 4~
$ review_scores_value                          <dbl> 4.66, 4.69, 4.40, 4.71, 4~
$ license                                      <lgl> NA, NA, NA, NA, NA, NA, N~
$ instant_bookable                             <lgl> FALSE, FALSE, FALSE, FALS~
$ calculated_host_listings_count               <dbl> 1, 1, 9, 1, 9, 9, 9, 1, 1~
$ calculated_host_listings_count_entire_homes  <dbl> 1, 1, 5, 1, 5, 5, 5, 1, 1~
$ calculated_host_listings_count_private_rooms <dbl> 0, 0, 4, 0, 4, 4, 4, 0, 0~
$ calculated_host_listings_count_shared_rooms  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0~
$ reviews_per_month                            <dbl> 1.08, 1.20, 3.08, 0.12, 2~

3.1.1 Key observations:

  • In total, there are 74 variables/columns and 6,046 observations/rows
  • The numeric variables in our dataset include: id, scrape_id, host_id, host_listings_count, host_total_listings_count, latitude, longitude, accommodates, bedrooms, beds, minimum_nights, maximum_nights, minimum_minimum_nights, maximum_minimum_nights, minimum_maximum_nights, maximum_maximum_nights, minimum_nights_avg_ntm, maximum_nights_avg_ntm, availability_30, availability_60, availability_90, availability_365, number_of_reviews, number_of_reviews_ltm, number_of_reviews_l30d, review_scores_rating, review_scores_accuracy, review_scores_cleanliness, review_scores_checkin, review_scores_communication, review_scores_location, review_scores_value, calculated_host_listings_count, calculated_host_listings_count_entire_homes, calculated_host_listings_count_private_rooms, calculated_host_listings_count_shared_rooms, reviews_per_month
  • The variables that are categorical or factors (or can be transformed into them) include: Host location, host response time, host_is_superhost, host_neighbourhood, host_verifications, host_has_profile_picture, host_identity_verified, neighbourhood, neighbourhood_cleansed, property_type, room_type, bathrooms_tex, amenities, has_availability, instant_bookable are categorial or factor variables. Some of these variables will be reduced to fewer (mutually exclusive) options
  • Price, host_response_rate and host_acceptance_rate should be numeric but have been stored as a character. Before computing the summary statistics or finding NAs, we will transform these variables into numeric ones
  • Some of the variables will be less meaningful (or practically impossible to incorporate) for our analysis and can be excluded from our dataset. More specifically, id, listing_url, scrape_id, last_scraped, name, description, neighborhood_overview, picture_url, host_id, host_url, host_name, host_about, host_thumbnail_url, host_picture_url, host_verifications, minimum_minimum_nights, maximum_minimum_nights, minimum_maximum_nights, maximum_maximum_nights, calendar_updated, calendar_last_scraped, first_review, last_review are eliminated
# Transform price, host_response_rate and host_acceptance_rate into numeric variables for summary statistics
listings <- listings %>% 
  mutate(price = parse_number(price),
         host_response_rate=parse_number(host_response_rate),
         host_acceptance_rate=parse_number(host_acceptance_rate))

# Check whether transformation has been successful
typeof(c(listings$price, listings$host_response_rate, listings$host_acceptance_rate))
[1] "double"
# Reduce dataset to exclude variables that are either irrelevant or too difficult to analyse
reduced_listings <- listings %>% 
  select(-c(id, listing_url, scrape_id, last_scraped, name, description, neighborhood_overview, picture_url, host_id, host_url, host_name, host_about, host_thumbnail_url, host_picture_url, host_verifications, minimum_minimum_nights, maximum_minimum_nights, minimum_maximum_nights, maximum_maximum_nights, calendar_updated, calendar_last_scraped, first_review, last_review))

3.2 Computing summary statistics of the variables of interest, or finding NAs

# See summary statistics (excl. minimum values for numeric variables)
skim(reduced_listings)
Data summary
Name reduced_listings
Number of rows 6046
Number of columns 51
_______________________
Column type frequency:
character 9
Date 1
logical 8
numeric 33
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
host_location 12 1.00 2 89 0 183 0
host_response_time 1 1.00 3 18 0 5 0
host_neighbourhood 824 0.86 2 33 0 144 0
neighbourhood 3486 0.42 9 70 0 178 0
neighbourhood_cleansed 0 1.00 5 17 0 18 0
property_type 0 1.00 3 35 0 70 0
room_type 0 1.00 10 15 0 4 0
bathrooms_text 22 1.00 6 17 0 35 0
amenities 0 1.00 2 1327 0 3844 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
host_since 1 1 2010-01-09 2021-09-19 2016-04-18 1268

Variable type: logical

skim_variable n_missing complete_rate mean count
host_is_superhost 1 1 0.22 FAL: 4718, TRU: 1327
host_has_profile_pic 1 1 1.00 TRU: 6035, FAL: 10
host_identity_verified 1 1 0.58 TRU: 3489, FAL: 2556
neighbourhood_group_cleansed 6046 0 NaN :
bathrooms 6046 0 NaN :
has_availability 0 1 1.00 TRU: 6028, FAL: 18
license 6046 0 NaN :
instant_bookable 0 1 0.28 FAL: 4349, TRU: 1697

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
host_response_rate 889 0.85 82.94 32.72 0.00 90.00 99.00 100.00 100.00 <U+2581><U+2581><U+2581><U+2581><U+2587>
host_acceptance_rate 1501 0.75 68.50 28.40 0.00 50.00 72.00 100.00 100.00 <U+2581><U+2582><U+2585><U+2585><U+2587>
host_listings_count 1 1.00 104.24 146.09 0.00 3.00 13.00 225.00 457.00 <U+2587><U+2581><U+2582><U+2581><U+2582>
host_total_listings_count 1 1.00 104.24 146.09 0.00 3.00 13.00 225.00 457.00 <U+2587><U+2581><U+2582><U+2581><U+2582>
latitude 0 1.00 22.30 0.05 22.16 22.28 22.30 22.31 22.56 <U+2581><U+2587><U+2581><U+2581><U+2581>
longitude 0 1.00 114.16 0.05 113.86 114.16 114.17 114.18 114.36 <U+2581><U+2581><U+2582><U+2587><U+2581>
accommodates 0 1.00 2.70 2.35 0.00 2.00 2.00 3.00 16.00 <U+2587><U+2582><U+2581><U+2581><U+2581>
bedrooms 1192 0.80 1.29 0.82 1.00 1.00 1.00 1.00 11.00 <U+2587><U+2581><U+2581><U+2581><U+2581>
beds 147 0.98 1.62 1.51 0.00 1.00 1.00 2.00 16.00 <U+2587><U+2581><U+2581><U+2581><U+2581>
price 0 1.00 805.04 2265.67 0.00 214.00 397.50 700.00 84346.00 <U+2587><U+2581><U+2581><U+2581><U+2581>
minimum_nights 0 1.00 17.02 30.10 1.00 1.00 5.00 29.00 1125.00 <U+2587><U+2581><U+2581><U+2581><U+2581>
maximum_nights 0 1.00 857.22 431.75 1.00 365.00 1125.00 1125.00 1125.00 <U+2582><U+2581><U+2581><U+2581><U+2587>
minimum_nights_avg_ntm 1 1.00 17.05 30.06 1.00 1.00 5.60 29.00 1125.00 <U+2587><U+2581><U+2581><U+2581><U+2581>
maximum_nights_avg_ntm 1 1.00 913.49 397.12 1.00 1124.00 1125.00 1125.00 1125.00 <U+2582><U+2581><U+2581><U+2581><U+2587>
availability_30 0 1.00 21.40 12.02 0.00 13.00 29.00 30.00 30.00 <U+2583><U+2581><U+2581><U+2581><U+2587>
availability_60 0 1.00 45.30 22.48 0.00 35.00 59.00 60.00 60.00 <U+2582><U+2581><U+2581><U+2581><U+2587>
availability_90 0 1.00 70.12 32.17 0.00 61.00 89.00 90.00 90.00 <U+2582><U+2581><U+2581><U+2581><U+2587>
availability_365 0 1.00 256.53 132.47 0.00 116.00 358.00 364.00 365.00 <U+2582><U+2582><U+2582><U+2581><U+2587>
number_of_reviews 0 1.00 17.62 45.93 0.00 0.00 0.00 10.00 891.00 <U+2587><U+2581><U+2581><U+2581><U+2581>
number_of_reviews_ltm 0 1.00 1.40 6.96 0.00 0.00 0.00 0.00 110.00 <U+2587><U+2581><U+2581><U+2581><U+2581>
number_of_reviews_l30d 0 1.00 0.12 0.65 0.00 0.00 0.00 0.00 16.00 <U+2587><U+2581><U+2581><U+2581><U+2581>
review_scores_rating 3067 0.49 4.38 0.92 0.00 4.29 4.65 4.89 5.00 <U+2581><U+2581><U+2581><U+2581><U+2587>
review_scores_accuracy 3130 0.48 4.58 0.67 0.00 4.50 4.78 5.00 5.00 <U+2581><U+2581><U+2581><U+2581><U+2587>
review_scores_cleanliness 3130 0.48 4.46 0.69 0.00 4.33 4.66 4.90 5.00 <U+2581><U+2581><U+2581><U+2581><U+2587>
review_scores_checkin 3130 0.48 4.67 0.62 0.00 4.65 4.86 5.00 5.00 <U+2581><U+2581><U+2581><U+2581><U+2587>
review_scores_communication 3131 0.48 4.68 0.61 1.00 4.67 4.88 5.00 5.00 <U+2581><U+2581><U+2581><U+2581><U+2587>
review_scores_location 3130 0.48 4.73 0.50 1.00 4.68 4.88 5.00 5.00 <U+2581><U+2581><U+2581><U+2581><U+2587>
review_scores_value 3130 0.48 4.48 0.65 1.00 4.35 4.64 4.85 5.00 <U+2581><U+2581><U+2581><U+2581><U+2587>
calculated_host_listings_count 0 1.00 96.62 137.08 1.00 3.00 11.00 185.00 376.00 <U+2587><U+2581><U+2581><U+2581><U+2582>
calculated_host_listings_count_entire_homes 0 1.00 18.74 38.32 0.00 0.00 2.00 16.00 155.00 <U+2587><U+2581><U+2581><U+2581><U+2581>
calculated_host_listings_count_private_rooms 0 1.00 70.73 119.53 0.00 0.00 5.00 49.00 333.00 <U+2587><U+2581><U+2581><U+2581><U+2581>
calculated_host_listings_count_shared_rooms 0 1.00 6.97 15.90 0.00 0.00 0.00 4.00 64.00 <U+2587><U+2581><U+2581><U+2581><U+2581>
reviews_per_month 3067 0.49 0.87 1.43 0.01 0.10 0.36 1.06 33.00 <U+2587><U+2581><U+2581><U+2581><U+2581>
# See minimum values for numeric values
summary(reduced_listings)%>% 
  kable(format = "html", caption = "Summary of Reduced Listings Data", format.args = list(scientific = FALSE, big.mark = ",")) %>% kable_classic()
Summary of Reduced Listings Data
host_since host_location host_response_time host_response_rate host_acceptance_rate host_is_superhost host_neighbourhood host_listings_count host_total_listings_count host_has_profile_pic host_identity_verified neighbourhood neighbourhood_cleansed neighbourhood_group_cleansed latitude longitude property_type room_type accommodates bathrooms bathrooms_text bedrooms beds amenities price minimum_nights maximum_nights minimum_nights_avg_ntm maximum_nights_avg_ntm has_availability availability_30 availability_60 availability_90 availability_365 number_of_reviews number_of_reviews_ltm number_of_reviews_l30d review_scores_rating review_scores_accuracy review_scores_cleanliness review_scores_checkin review_scores_communication review_scores_location review_scores_value license instant_bookable calculated_host_listings_count calculated_host_listings_count_entire_homes calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms reviews_per_month
Min. :2010-01-09 Length:6046 Length:6046 Min. : 0.00 Min. : 0.0 Mode :logical Length:6046 Min. : 0.0 Min. : 0.0 Mode :logical Mode :logical Length:6046 Length:6046 Mode:logical Min. :22.16 Min. :113.9 Length:6046 Length:6046 Min. : 0.000 Mode:logical Length:6046 Min. : 1.000 Min. : 0.000 Length:6046 Min. : 0.0 Min. : 1.00 Min. : 1.0 Min. : 1.00 Min. : 1.0 Mode :logical Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.0 Min. : 0.00 Min. : 0.000 Min. : 0.0000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :1.000 Min. :1.000 Min. :1.000 Mode:logical Mode :logical Min. : 1.00 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.0100
1st Qu.:2014-06-18 Class :character Class :character 1st Qu.: 90.00 1st Qu.: 50.0 FALSE:4718 Class :character 1st Qu.: 3.0 1st Qu.: 3.0 FALSE:10 FALSE:2556 Class :character Class :character NA’s:6046 1st Qu.:22.28 1st Qu.:114.2 Class :character Class :character 1st Qu.: 2.000 NA’s:6046 Class :character 1st Qu.: 1.000 1st Qu.: 1.000 Class :character 1st Qu.: 214.0 1st Qu.: 1.00 1st Qu.: 365.0 1st Qu.: 1.00 1st Qu.:1124.0 FALSE:18 1st Qu.:13.0 1st Qu.:35.0 1st Qu.:61.00 1st Qu.:116.0 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.:4.290 1st Qu.:4.500 1st Qu.:4.330 1st Qu.:4.650 1st Qu.:4.670 1st Qu.:4.680 1st Qu.:4.350 NA’s:6046 FALSE:4349 1st Qu.: 3.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.1000
Median :2016-04-18 Mode :character Mode :character Median : 99.00 Median : 72.0 TRUE :1327 Mode :character Median : 13.0 Median : 13.0 TRUE :6035 TRUE :3489 Mode :character Mode :character NA Median :22.30 Median :114.2 Mode :character Mode :character Median : 2.000 NA Mode :character Median : 1.000 Median : 1.000 Mode :character Median : 397.5 Median : 5.00 Median :1125.0 Median : 5.60 Median :1125.0 TRUE :6028 Median :29.0 Median :59.0 Median :89.00 Median :358.0 Median : 0.00 Median : 0.000 Median : 0.0000 Median :4.650 Median :4.780 Median :4.660 Median :4.860 Median :4.880 Median :4.880 Median :4.640 NA TRUE :1697 Median : 11.00 Median : 2.00 Median : 5.00 Median : 0.00 Median : 0.3600
Mean :2016-05-03 NA NA Mean : 82.94 Mean : 68.5 NA’s :1 NA Mean :104.2 Mean :104.2 NA’s :1 NA’s :1 NA NA NA Mean :22.30 Mean :114.2 NA NA Mean : 2.696 NA NA Mean : 1.285 Mean : 1.618 NA Mean : 805.0 Mean : 17.02 Mean : 857.2 Mean : 17.05 Mean : 913.5 NA Mean :21.4 Mean :45.3 Mean :70.12 Mean :256.5 Mean : 17.62 Mean : 1.399 Mean : 0.1209 Mean :4.384 Mean :4.575 Mean :4.465 Mean :4.672 Mean :4.681 Mean :4.733 Mean :4.475 NA NA Mean : 96.62 Mean : 18.74 Mean : 70.73 Mean : 6.97 Mean : 0.8739
3rd Qu.:2017-12-11 NA NA 3rd Qu.:100.00 3rd Qu.:100.0 NA NA 3rd Qu.:225.0 3rd Qu.:225.0 NA NA NA NA NA 3rd Qu.:22.31 3rd Qu.:114.2 NA NA 3rd Qu.: 3.000 NA NA 3rd Qu.: 1.000 3rd Qu.: 2.000 NA 3rd Qu.: 700.0 3rd Qu.: 29.00 3rd Qu.:1125.0 3rd Qu.: 29.00 3rd Qu.:1125.0 NA 3rd Qu.:30.0 3rd Qu.:60.0 3rd Qu.:90.00 3rd Qu.:364.0 3rd Qu.: 10.00 3rd Qu.: 0.000 3rd Qu.: 0.0000 3rd Qu.:4.890 3rd Qu.:5.000 3rd Qu.:4.900 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:4.850 NA NA 3rd Qu.:185.00 3rd Qu.: 16.00 3rd Qu.: 49.00 3rd Qu.: 4.00 3rd Qu.: 1.0600
Max. :2021-09-19 NA NA Max. :100.00 Max. :100.0 NA NA Max. :457.0 Max. :457.0 NA NA NA NA NA Max. :22.56 Max. :114.4 NA NA Max. :16.000 NA NA Max. :11.000 Max. :16.000 NA Max. :84346.0 Max. :1125.00 Max. :1125.0 Max. :1125.00 Max. :1125.0 NA Max. :30.0 Max. :60.0 Max. :90.00 Max. :365.0 Max. :891.00 Max. :110.000 Max. :16.0000 Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000 NA NA Max. :376.00 Max. :155.00 Max. :333.00 Max. :64.00 Max. :33.0000
NA’s :1 NA NA NA’s :889 NA’s :1501 NA NA NA’s :1 NA’s :1 NA NA NA NA NA NA NA NA NA NA NA NA NA’s :1192 NA’s :147 NA NA NA NA NA’s :1 NA’s :1 NA NA NA NA NA NA NA NA NA’s :3067 NA’s :3130 NA’s :3130 NA’s :3130 NA’s :3131 NA’s :3130 NA’s :3130 NA NA NA NA NA NA NA’s :3067

3.2.1 Key observations:

  • There appear to be quite a few missing values in several variables. The variables with the lowest “complete rate” are neighborhood, first_review, last_review, review_scores_rating, review_scores_accuracy, review_scores_cleanliness, review_scores_checkin, review_scores_communication, review_scores_location, review_scores_value, reviews_per_month
  • A first glimpse at the histograms that are shown for the numeric variables, show that most numeric variables are either skewed to the left or skewed to the right. Our main variable of interest, price, appears to be skewed to the right. In the data wrangling section, this variable will undergo log transformation to correct for this skewness
  • The dataset contains a few unusual observations. For example, there is an observation which accommodates 0 people, another observation with 0 beds, a price of 0, and 0 host listings. In the data wrangling section, we will eliminate some of these observations

3.3 Creating informative visualisations

First, we will transform price into a new variable price_4_nights, decide whether it should be transformed to a logarithm, and change some categorical variables into fewer variables. Next, we will consider the correlation between variables. Rather than creating visualisations for all 51 variables, we will only create visualisations for the variables that are intuitively related to the price of a listing and are likely to be included in our regression model

3.3.1 Price

For ease of analysis, we will create a price for 4 nights variable since that is the objective of this study

# Creating the price per 4 nights
reduced_listings <- reduced_listings %>%
mutate(price_4_nights = (price*4))

Next, we will inspect the distribution of the variables

# Distribution of price_4_nights
ggplot(reduced_listings, aes(x = price_4_nights)) +
  geom_density()+
  labs(title = "Price per 4 nights", subtitle = "Density",
       x = "Price per 4 nights",
       y = "Number of listings") +
  theme_economist_white()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
  NULL

Given the distribution of price_4_nights, it appears that the the price_4_nights variable is skewed to the right

To correct for this, we can log transform the variable to create a normal distribution, which will help with analysis and regression

# Creating the log(price per 4 nights)
reduced_listings <- reduced_listings %>%
mutate(log_price_4_nights=log(price_4_nights))

# Check distribution of data

# Distribution of log_price_4_nights
ggplot(reduced_listings, aes(x = log_price_4_nights)) +
  geom_density()+
  labs(title = "Log price per 4 nights", subtitle = "Density",
       x = "Log price per 4 nights",
       y = "Number of listings") +
  theme_economist_white()+ theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
  NULL

The data is now approximately normally distributed

3.3.2 Property types

# Look at the distribution of property types
reduced_listings %>%
count(property_type, name="count", sort=TRUE) %>%
  mutate(percent=round(100*count/sum(count),2)) %>%
  rename(c("Property type" = "property_type", "Count" = "count", "Percentage" = "percent")) %>% 
  kable() %>% 
  kable_classic()
Property type Count Percentage
Private room in rental unit 2120 35.06
Entire rental unit 1360 22.49
Entire condominium (condo) 419 6.93
Entire serviced apartment 347 5.74
Shared room in rental unit 236 3.90
Private room in serviced apartment 196 3.24
Private room in condominium (condo) 182 3.01
Private room in hostel 174 2.88
Room in hotel 118 1.95
Private room in guesthouse 116 1.92
Room in boutique hotel 98 1.62
Private room in guest suite 66 1.09
Private room in residential home 64 1.06
Entire residential home 61 1.01
Shared room in hostel 56 0.93
Room in hostel 52 0.86
Private room in bed and breakfast 44 0.73
Entire guest suite 37 0.61
Room in aparthotel 27 0.45
Entire loft 21 0.35
Entire place 20 0.33
Room in bed and breakfast 17 0.28
Entire guesthouse 16 0.26
Shared room in condominium (condo) 15 0.25
Private room 13 0.22
Entire cottage 12 0.20
Private room in villa 11 0.18
Room in serviced apartment 11 0.18
Entire bungalow 8 0.13
Private room in tiny house 8 0.13
Private room in townhouse 8 0.13
Shared room in guesthouse 8 0.13
Shared room in residential home 8 0.13
Entire townhouse 7 0.12
Entire villa 6 0.10
Private room in loft 6 0.10
Tiny house 6 0.10
Houseboat 5 0.08
Private room in bungalow 5 0.08
Shared room in boutique hotel 5 0.08
Shared room in serviced apartment 5 0.08
Private room in cottage 4 0.07
Shared room in nature lodge 4 0.07
Boat 3 0.05
Farm stay 3 0.05
Shared room in bed and breakfast 3 0.05
Tent 3 0.05
Campsite 2 0.03
Cave 2 0.03
Pension 2 0.03
Private room in minsu 2 0.03
Shared room 2 0.03
Shared room in boat 2 0.03
Shared room in guest suite 2 0.03
Shared room in tiny house 2 0.03
Shared room in townhouse 2 0.03
Castle 1 0.02
Earth house 1 0.02
Entire home/apt 1 0.02
Hut 1 0.02
Island 1 0.02
Private room in boat 1 0.02
Private room in cabin 1 0.02
Private room in casa particular 1 0.02
Private room in earth house 1 0.02
Private room in hut 1 0.02
Private room in kezhan 1 0.02
Private room in nature lodge 1 0.02
Shared room in dome house 1 0.02
Shared room in pension 1 0.02
# Organise into 5 groups
reduced_listings <- reduced_listings %>%
  mutate(prop_type_simplified = case_when(
    property_type %in% c("Private room in rental unit","Entire rental unit", "Entire condominium (condo)","Entire serviced apartment") ~ property_type,
    TRUE ~ "Other"
  ))

# Check whether new categorisation has been successful
reduced_listings %>%
count(prop_type_simplified, name="count", sort=TRUE) %>%
  mutate(percent=round(100*count/sum(count),2)) %>%
  rename(c("Property type" = "prop_type_simplified", "Count" = "count", "Percentage" = "percent")) %>% 
    kable() %>% 
  kable_classic()
Property type Count Percentage
Private room in rental unit 2120 35.06
Other 1800 29.77
Entire rental unit 1360 22.49
Entire condominium (condo) 419 6.93
Entire serviced apartment 347 5.74

The top four property types are Private room in rental unit, Entire rental unit, Entire condominium (condo) and Entire serviced apartment. Together, they make up approximately 70% of the total listings

# Create boxplot
chart_proptype <- reduced_listings %>% 
  ggplot(aes(x = prop_type_simplified, y = log_price_4_nights, group = prop_type_simplified)) +
    geom_boxplot() +
    coord_flip() +
    theme_economist_white() +
    labs(
      title = "Entire condo and entire rental unit exhibit highest median prices",
      subtitle = "Greatest dispersion in 'Other', as remaining variables are grouped here",
      x = element_blank(),
      y = "Log Price for 4 nights"
    ) +
  theme(panel.border = element_blank())+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))
         
chart_proptype

The distribution is exactly as expected. Entire condos and entire rental units have the highest median prices. Private rooms in a rental unit are naturally the cheapest listings. As we grouped the remaining values of the variables in “Other”, there are many extreme outliers in this group.

3.3.3 Neighbourhoods

# Look at the distribution of neighbourhoods
reduced_listings %>%
count(neighbourhood_cleansed, name="count", sort=TRUE) %>%
  mutate(percent=round(100*count/sum(count),2)) %>%
  rename(c("Neighbourhood" = "neighbourhood_cleansed", "Count" = "count", "Percentage" = "percent")) %>% 
   kable() %>% 
  kable_classic()
Neighbourhood Count Percentage
Yau Tsim Mong 2351 38.89
Wan Chai 1370 22.66
Central & Western 920 15.22
Islands 268 4.43
Kowloon City 243 4.02
Eastern 233 3.85
Sham Shui Po 138 2.28
Yuen Long 130 2.15
North 120 1.98
Sai Kung 71 1.17
Southern 42 0.69
Sha Tin 41 0.68
Tuen Mun 35 0.58
Kwun Tong 27 0.45
Tai Po 23 0.38
Tsuen Wan 20 0.33
Kwai Tsing 7 0.12
Wong Tai Sin 7 0.12
# Organise into 5 groups
reduced_listings <- reduced_listings %>%
  mutate(neighbourhood_categorical = case_when(
    neighbourhood_cleansed %in% c("Yau Tsim Mong","Wan Chai", "Central & Western","Islands") ~ neighbourhood_cleansed,
    TRUE ~ "Other"
  ))

# Check whether new categorisation has been successful
reduced_listings %>%
count(neighbourhood_categorical, name="count", sort=TRUE) %>%
  mutate(percent=round(100*count/sum(count),2)) %>%
  rename(c("Neighbourhood" = "neighbourhood_categorical", "Count" = "count", "Percentage" = "percent")) %>% 
   kable() %>% 
  kable_classic()
Neighbourhood Count Percentage
Yau Tsim Mong 2351 38.89
Wan Chai 1370 22.66
Other 1137 18.81
Central & Western 920 15.22
Islands 268 4.43

The top four neighbourhoods are Yau Tsim Mong, Wan Chai, Central & Western and Islands. Together, they make up approximately 80% of the total listings.

# Create boxplot
chart_neighbourhood <- reduced_listings %>% 
  ggplot(aes(x = neighbourhood_categorical, y = log_price_4_nights, group = neighbourhood_categorical)) +
    geom_boxplot() +
    coord_flip() +
    theme_economist_white() +
    labs(
      title = "Listings in the location 'Islands' have the highest median price",
      subtitle = "Greatest dispersion in 'Other'",
      x = element_blank(),
      y = "Log Price for 4 nights"
    ) +
  theme(panel.border = element_blank()) + theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))
         
chart_neighbourhood

Overall, the boxplot shows many outliers with high prices for all neighbourhoods. As we expected, listings on islands have the highest median price. These should be more exotic locations for more affluent travelers with higher real estate prices. Wan Chai is one of the busiest commercial areas in Hong Kong. The buildings are tall, the apartments small and hence the prices for Airbnb apartments are also on the lower end.

3.3.4 Bathroom text

# Look at the distribution of bathroom text
reduced_listings %>%
  filter(bathrooms_text!="NA") %>% 
count(bathrooms_text, name="count", sort=TRUE) %>%
  mutate(percent=round(100*count/sum(count),2)) %>%
  rename(c("Bathroom description" = "bathrooms_text", "Count" = "count", "Percentage" = "percent")) %>% 
   kable() %>% 
  kable_classic()
Bathroom description Count Percentage
1 bath 2840 47.14
1 private bath 1196 19.85
1 shared bath 1031 17.11
2 baths 276 4.58
1.5 baths 212 3.52
2 shared baths 151 2.51
1.5 shared baths 49 0.81
3 shared baths 42 0.70
3 baths 41 0.68
4 shared baths 27 0.45
4 baths 21 0.35
0 baths 18 0.30
Half-bath 17 0.28
Private half-bath 16 0.27
Shared half-bath 16 0.27
2.5 baths 14 0.23
2.5 shared baths 10 0.17
0 shared baths 9 0.15
5 baths 7 0.12
8 baths 6 0.10
9 baths 4 0.07
10 baths 3 0.05
3.5 shared baths 3 0.05
6 baths 3 0.05
5.5 baths 2 0.03
10 shared baths 1 0.02
11 baths 1 0.02
3.5 baths 1 0.02
4.5 baths 1 0.02
4.5 shared baths 1 0.02
5 shared baths 1 0.02
5.5 shared baths 1 0.02
7 baths 1 0.02
8 shared baths 1 0.02
9 shared baths 1 0.02
# Organise into 5 groups
reduced_listings <- reduced_listings %>%
  mutate(bathrooms = case_when(
    bathrooms_text %in% c("1 bath","1 private bath", "1 shared bath","2 baths") ~ bathrooms_text,
    TRUE ~ "Other"
  ))

# Check whether new categorisation has been successful
reduced_listings %>%
count(bathrooms, name="count", sort=TRUE) %>%
  mutate(percent=round(100*count/sum(count),2)) %>%
  rename(c("Bathroom description" = "bathrooms", "Count" = "count", "Percentage" = "percent")) %>% 
   kable() %>% 
  kable_classic()
Bathroom description Count Percentage
1 bath 2840 46.97
1 private bath 1196 19.78
1 shared bath 1031 17.05
Other 703 11.63
2 baths 276 4.57

The top four bathroom descriptions are 1 bath, 1 private bath, 1 shared bath and 2 baths. Together, they make up approximately 90% of the total listings.

# Create boxplot
chart_baths <- reduced_listings %>% 
  ggplot(aes(x = bathrooms, y = log_price_4_nights, group = bathrooms)) +
    geom_boxplot() +
    coord_flip() +
    theme_economist_white() +
    labs(
      title = "The more bathrooms the higher the price",
      x = element_blank(),
      y = "Log Price for 4 nights"
    ) +
  theme(panel.border = element_blank()) + theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))
         
chart_baths

No surprises to see here. One shared bathroom clearly corresponds to the lowest price. A private bathroom is more comfortable to use and has a higher median price. Two bathrooms correspond by far to the highest price. All of this was expected.

3.3.5 Amenities

To expand our analysis, we have decided to decompose the amenities into important amenities, accessibility amenities, space amenities and gear amenities

# Grouping together amenities for data analysis
reduced_listings$amenity_count <- str_count(reduced_listings$amenities,",")+1
head(reduced_listings$amenities)
[1] "[\"Washer\", \"Air conditioning\", \"Long term stays allowed\", \"Wifi\", \"Kitchen\", \"Elevator\"]"                                                                                                                                                                                                                                                                                                                                                                                                           
[2] "[\"Heating\", \"Air conditioning\", \"Shampoo\", \"Long term stays allowed\", \"Essentials\", \"Bathtub\", \"Coffee maker\", \"Dishes and silverware\", \"Carbon monoxide alarm\", \"TV with standard cable\", \"Iron\", \"Oven\", \"Hot water\", \"Room-darkening shades\", \"Kitchen\", \"Cable TV\", \"Dryer\", \"First aid kit\", \"Wifi\", \"Cooking basics\", \"Hair dryer\", \"Washer\", \"Hangers\", \"Breakfast\", \"Dedicated workspace\", \"Refrigerator\", \"Smoke alarm\", \"Stove\", \"Lockbox\"]"
[3] "[\"Heating\", \"Washer\", \"Air conditioning\", \"Crib\", \"Shampoo\", \"TV with standard cable\", \"Long term stays allowed\", \"Iron\", \"Wifi\", \"Dedicated workspace\", \"Cooking basics\", \"Kitchen\", \"Hair dryer\", \"Lockbox\", \"Essentials\", \"Cable TV\"]"                                                                                                                                                                                                                                       
[4] "[\"Air conditioning\", \"TV with standard cable\", \"Long term stays allowed\", \"Wifi\", \"Kitchen\", \"Cable TV\"]"                                                                                                                                                                                                                                                                                                                                                                                           
[5] "[\"Heating\", \"Washer\", \"Air conditioning\", \"Crib\", \"Shampoo\", \"TV with standard cable\", \"Long term stays allowed\", \"Iron\", \"Wifi\", \"Dedicated workspace\", \"Cooking basics\", \"Kitchen\", \"Hair dryer\", \"Lockbox\", \"Essentials\", \"Cable TV\", \"Elevator\"]"                                                                                                                                                                                                                         
[6] "[\"Heating\", \"Air conditioning\", \"Shampoo\", \"Long term stays allowed\", \"Essentials\", \"Dishes and silverware\", \"TV with standard cable\", \"Iron\", \"Hot water\", \"Kitchen\", \"Cable TV\", \"Crib\", \"Microwave\", \"Wifi\", \"Cooking basics\", \"Hair dryer\", \"Hangers\", \"Fire extinguisher\", \"Dedicated workspace\", \"Refrigerator\", \"Smoke alarm\", \"Lockbox\"]"                                                                                                                   
amenities <- unique(reduced_listings$amenities)
head(reduced_listings$amenities)
[1] "[\"Washer\", \"Air conditioning\", \"Long term stays allowed\", \"Wifi\", \"Kitchen\", \"Elevator\"]"                                                                                                                                                                                                                                                                                                                                                                                                           
[2] "[\"Heating\", \"Air conditioning\", \"Shampoo\", \"Long term stays allowed\", \"Essentials\", \"Bathtub\", \"Coffee maker\", \"Dishes and silverware\", \"Carbon monoxide alarm\", \"TV with standard cable\", \"Iron\", \"Oven\", \"Hot water\", \"Room-darkening shades\", \"Kitchen\", \"Cable TV\", \"Dryer\", \"First aid kit\", \"Wifi\", \"Cooking basics\", \"Hair dryer\", \"Washer\", \"Hangers\", \"Breakfast\", \"Dedicated workspace\", \"Refrigerator\", \"Smoke alarm\", \"Stove\", \"Lockbox\"]"
[3] "[\"Heating\", \"Washer\", \"Air conditioning\", \"Crib\", \"Shampoo\", \"TV with standard cable\", \"Long term stays allowed\", \"Iron\", \"Wifi\", \"Dedicated workspace\", \"Cooking basics\", \"Kitchen\", \"Hair dryer\", \"Lockbox\", \"Essentials\", \"Cable TV\"]"                                                                                                                                                                                                                                       
[4] "[\"Air conditioning\", \"TV with standard cable\", \"Long term stays allowed\", \"Wifi\", \"Kitchen\", \"Cable TV\"]"                                                                                                                                                                                                                                                                                                                                                                                           
[5] "[\"Heating\", \"Washer\", \"Air conditioning\", \"Crib\", \"Shampoo\", \"TV with standard cable\", \"Long term stays allowed\", \"Iron\", \"Wifi\", \"Dedicated workspace\", \"Cooking basics\", \"Kitchen\", \"Hair dryer\", \"Lockbox\", \"Essentials\", \"Cable TV\", \"Elevator\"]"                                                                                                                                                                                                                         
[6] "[\"Heating\", \"Air conditioning\", \"Shampoo\", \"Long term stays allowed\", \"Essentials\", \"Dishes and silverware\", \"TV with standard cable\", \"Iron\", \"Hot water\", \"Kitchen\", \"Cable TV\", \"Crib\", \"Microwave\", \"Wifi\", \"Cooking basics\", \"Hair dryer\", \"Hangers\", \"Fire extinguisher\", \"Dedicated workspace\", \"Refrigerator\", \"Smoke alarm\", \"Lockbox\"]"                                                                                                                   
#remove excess characters
amenities = str_remove_all(amenities,"\"")
amenities = str_remove_all(amenities,"\\[")
amenities = str_remove_all(amenities,"\\]")
amenities = str_trim(amenities)
total_amenities <- str_trim(unlist(strsplit(amenities,"[,]")))
head(total_amenities,10)
 [1] "Washer"                  "Air conditioning"       
 [3] "Long term stays allowed" "Wifi"                   
 [5] "Kitchen"                 "Elevator"               
 [7] "Heating"                 "Air conditioning"       
 [9] "Shampoo"                 "Long term stays allowed"
as.data.frame(table(total_amenities))%>%
  arrange(desc(Freq)) %>% 
  rename(c("Amenities" = "total_amenities", "Frequency" = "Freq")) %>% 
  kable() %>% 
  kable_classic()
Amenities Frequency
Air conditioning 3702
Long term stays allowed 3645
Wifi 3613
Essentials 3133
Hangers 2851
Hair dryer 2747
Shampoo 2690
Kitchen 2469
Elevator 2395
TV 2323
Hot water 2268
Dedicated workspace 2193
Washer 2170
Iron 1959
Smoke alarm 1894
Fire extinguisher 1861
Refrigerator 1301
Private entrance 1283
Bed linens 1183
Lock on bedroom door 1144
Cooking basics 1069
Dishes and silverware 1025
Heating 1019
Dryer 979
First aid kit 975
Microwave 962
Carbon monoxide alarm 860
Stove 735
Luggage dropoff allowed 686
Extra pillows and blankets 651
Cable TV 526
TV with standard cable 511
Host greets you 457
Paid parking off premises 381
Shower gel 374
Oven 369
Gym 363
Room-darkening shades 347
Coffee maker 344
Keypad 344
Hot water kettle 306
Patio or balcony 304
Smart lock 302
Ethernet connection 294
Pool 272
Cleaning before checkout 258
Lockbox 225
Hot tub 219
Free parking on premises 217
Paid parking on premises 196
Security cameras on property 195
Dining table 194
Bathtub 193
Freezer 192
Cleaning products 175
Building staff 173
Breakfast 169
Window guards 157
Free street parking 153
Body soap 151
Crib 148
Single level home 144
BBQ grill 141
Clothing storage 136
Drying rack for clothing 131
Conditioner 125
Wine glasses 121
Children books and toys 112
Indoor fireplace 112
Rice maker 111
Backyard 106
Toaster 104
Beachfront 100
High chair 96
Outdoor furniture 96
Laundromat nearby 89
Outdoor dining area 77
Pack Play/travel crib 73
Mini fridge 72
Waterfront 71
Pocket wifi 70
Baby bath 68
Children dinnerware 68
Portable fans 67
Dishwasher 63
Safe 62
Babysitter recommendations 52
Clothing storage: closet 51
Barbecue utensils 50
Baking sheet 49
Free washer In unit 49
Table corner guards 44
Game console 40
Outlet covers 39
Private patio or balcony 39
Sound system 35
Board games 32
Nespresso machine 29
EV charger 26
Bread maker 24
Electric stove 22
Portable heater 22
Changing table 21
Beach essentials 20
Ceiling fan 20
Outdoor shower 19
Sound system with Bluetooth and aux 19
Window AC unit 18
Bluetooth sound system 16
Induction stove 16
Pour-over coffee 16
Free dryer In unit 14
Baby safety gates 13
Clothing storage: wardrobe 13
Washer unit 13
Netflix 12
Stainless steel electric stove 12
Wifi 1000 Mbps 12
Dedicated workspace: desk 11
Piano 11
Stainless steel oven 11
Dedicated workspace: table 10
Mosquito net 10
Shared pool 10
standard cable 10
Gas stove 9
Private hot tub 9
Central air conditioning 8
Central heating 8
Lake access 8
Paid parking garage off premises 8
Panasonic refrigerator 8
Record player 8
Baby monitor 7
Paid street parking off premises 7
Trash compactor 7
Washer building 7
Clothing storage: wardrobe and closet 6
Dryer unit 6
Bidet 5
Bikes 5
Clothing storage: closet and wardrobe 5
Game console: PS4 5
HDTV with Netflix 5
premium cable 5
Shared patio or balcony 5
32 HDTV with Chromecast 4
and office chair 4
Clothing storage: dresser 4
Dryer In building 4
Fire pit 4
Fireplace guards 4
Kayak 4
Paid parking lot off premises 4
Ping pong table 4
Pool table 4
Private fenced garden or backyard 4
Private garden or backyard 4
Radiant heating 4
Stainless steel induction stove 4
and desk 3
Clothing storage: closet and dresser 3
Clothing storage: dresser and closet 3
Clothing storage: walk-in closet 3
Dedicated workspace: desk and office chair 3
Dedicated workspace: monitor 3
Free dryer In building 3
Free washer In building 3
HBO Max 3
monitor 3
office chair 3
Private gym in building 3
Private pool 3
Sauna 3
Shared garden or backyard 3
Shared sauna 3
table 3
Wifi 100 Mbps 3
conditioner 2
and closet 2
and wardrobe 2
Apple TV 2
Boat slip 2
Clothing storage: wardrobe and walk-in closet 2
desk 2
dresser 2
Free driveway parking on premises 2
Free dryer 2
HDTV 2
HDTV with standard cable 2
Keurig coffee machine 2
Shared gym 2
Shared gym in building 2
Ski-in/Ski-out 2
Stainless steel gas stove 2
wardrobe 2
shampoo 1
32 HDTV 1
40 HDTV 1
40 TV with standard cable 1
42 HDTV with Apple TV 1
42 HDTV with HBO Max 1
42 HDTV with Netflix 1
42 HDTV with standard cable 1
43 HDTV 1
44 HDTV with Amazon Prime Video 1
48 HDTV with Apple TV 1
50 HDTV with Apple TV 1
55 HDTV with standard cable 1
55 TV with standard cable 1
65 HDTV with Amazon Prime Video 1
65 TV 1
Aesop body soap 1
and table 1
Any body soap 1
B&O sound system with Bluetooth and aux 1
Bathrobes 1
Bed Head TIGI shampoo 1
Bed sheets and pillows 1
Bluetooth speaker 1
bose Bluetooth sound system 1
Bottled water 1
Breakfast buffet available $295 per person per day 1
CAMBRIDGE Bluetooth sound system 1
Children books and toys for ages 2-5 years old 1
Children books and toys for ages 2-5 years old and 5-10 years old 1
Chromecast 1
Clothing storage: closet and walk-in closet 1
Clothing storage: walk-in closet and dresser 1
Complimentary self parking 1
Complimentary valet parking 1
Concierge 1
Dedicated workspace: office chair 1
Dedicated workspace: office chair and desk 1
Dedicated workspace: office chair and table 1
Dedicated workspace: table and desk 1
Depends body soap 1
Dole body soap 1
Dove or similar body soap 1
Elemis body soap 1
Fitness center 1
Fortress refrigerator 1
Free carport on premises 3 spaces 1
Free washer 1
Free wifi 1
Gaggenau refrigerator 1
Game console: Nintendo Switch 1
Game console: Nintendo Wii 1
HDTV with Apple TV 1
Heated pool with poolside bar outdoor 1
Hitachi refrigerator 1
I have a small oven to cook basics. It is not big enough to roast a chicken! oven 1
J&J body soap 1
JBL Bluetooth sound system 1
JBL speaker Bluetooth sound system 1
Laundry services 1
LG refrigerator 1
Limited housekeeping weekly 1
Marshall Bluetooth sound system 1
Minibar 1
Onsite restaurant Above & Beyond 1
Paid dryer In unit 1
Paid valet parking on premises 1
Paid washer In building 1
Pansonic refrigerator 1
Phill Smith Be Gourgeous conditioner 1
Portable air conditioning 1
Private outdoor pool 1
Rejoice shampoo 1
Room service 1
samsung refrigerator 1
shampoo and conditioner. Shower wash. shampoo 1
Shared fenced garden or backyard 1
Shared outdoor heated infinity pool 1
Shared outdoor heated pool 1
Simpa stainless steel gas stove 1
Slippers 1
Speaker Bluetooth sound system 1
Supply the basics 1
Thai organic brand conditioner 1
Thai organic brand shampoo 1
Toiletries 1
toshiba stainless steel oven 1
Tresseme conditioner 1
TV with Amazon Prime Video 1
TV with Apple TV 1
TV with Netflix 1
Unknow oven 1
Unknown body soap 1
walk-in closet 1
Whirlpool induction stove 1
Whirpool refrigerator 1
White Westinhouse refrigerator 1
Wifi 174 Mbps 1
Wifi 20 Mbps 1
reduced_listings$amenities = str_remove_all(reduced_listings$amenities,"\"")
reduced_listings$amenities = str_remove_all(reduced_listings$amenities,"\\[")
reduced_listings$amenities = str_remove_all(reduced_listings$amenities,"\\]")
reduced_listings$amenities = str_trim(reduced_listings$amenities)
head(reduced_listings$amenities)
[1] "Washer, Air conditioning, Long term stays allowed, Wifi, Kitchen, Elevator"                                                                                                                                                                                                                                                                                                               
[2] "Heating, Air conditioning, Shampoo, Long term stays allowed, Essentials, Bathtub, Coffee maker, Dishes and silverware, Carbon monoxide alarm, TV with standard cable, Iron, Oven, Hot water, Room-darkening shades, Kitchen, Cable TV, Dryer, First aid kit, Wifi, Cooking basics, Hair dryer, Washer, Hangers, Breakfast, Dedicated workspace, Refrigerator, Smoke alarm, Stove, Lockbox"
[3] "Heating, Washer, Air conditioning, Crib, Shampoo, TV with standard cable, Long term stays allowed, Iron, Wifi, Dedicated workspace, Cooking basics, Kitchen, Hair dryer, Lockbox, Essentials, Cable TV"                                                                                                                                                                                   
[4] "Air conditioning, TV with standard cable, Long term stays allowed, Wifi, Kitchen, Cable TV"                                                                                                                                                                                                                                                                                               
[5] "Heating, Washer, Air conditioning, Crib, Shampoo, TV with standard cable, Long term stays allowed, Iron, Wifi, Dedicated workspace, Cooking basics, Kitchen, Hair dryer, Lockbox, Essentials, Cable TV, Elevator"                                                                                                                                                                         
[6] "Heating, Air conditioning, Shampoo, Long term stays allowed, Essentials, Dishes and silverware, TV with standard cable, Iron, Hot water, Kitchen, Cable TV, Crib, Microwave, Wifi, Cooking basics, Hair dryer, Hangers, Fire extinguisher, Dedicated workspace, Refrigerator, Smoke alarm, Lockbox"                                                                                       
# Find count of important amenities in amenities
reduced_listings$important_amenities = ifelse(grepl("Wifi",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)+ifelse(grepl("Air conditioning",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)+ifelse(grepl("Essentials",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)
head(reduced_listings$important_amenities)  
[1] 2 3 3 2 3 3
# Find count of accessibility amenities in amenities
reduced_listings$accessibility = ifelse(grepl("Elevator",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)+ifelse(grepl("Private Entrance",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)
head(reduced_listings$accessibility)
[1] 1 0 0 0 1 0
#find count of space amenities in amenities
reduced_listings$spacing = ifelse(grepl("Kitchen",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)+ifelse(grepl("Workspace",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)
head(reduced_listings$spacing)
[1] 1 1 1 1 1 1
#find count of gear amenities in amenities
reduced_listings$gear_amenities = ifelse(grepl("TV",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)+ifelse(grepl("Washer",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)
head(reduced_listings$accessibility)
[1] 1 0 0 0 1 0

3.3.6 Correlation 1/4 - Review Scores

# Show correlation among different review scores to find potential multicollinearity
reduced_listings %>%
  select(log_price_4_nights, number_of_reviews,
         review_scores_rating, review_scores_cleanliness,
         review_scores_location, review_scores_value, reviews_per_month) %>%
  ggpairs(alpha=0.3)+
  theme_bw()

Many of the review variables exhibit high correlation and linear relationships with each other. Especially review_score_value, review_score_rating and review_score_cleanliness exhibit correlation. To avoid multicollinearity in our model, we decided to only use one of them. At first we used review_scores_cleanliness because it has the highest correlation with price. Later on in the process, we decided to use number_of_reviews instead, as it increased the explanatory power of our model and fits better logically

3.3.7 Correlation 2/4 - Numerical variables

# Show correlation among different numerical variables to find potential multicollinearity
reduced_listings %>%
  select(log_price_4_nights, bedrooms, accommodates, number_of_reviews, host_acceptance_rate ) %>%
  ggpairs(alpha=0.3)+
  theme_bw()

There is some correlation among the numerical values. Number of accomodates and bedrooms show a positive significant correlation of 0.5. This was expected as more accomodates require more beds and bedrooms. The other notable correlation we found was between host acceptance rate and number of reviews. This relationships also makes sense. The more guests a hosts accepts, the more reviews they will get.

3.3.8 Correlation 3/4 - Categorical variables

# Show correlation among values of categorical variables
reduced_listings_2 <- reduced_listings %>% 
  select(log_price_4_nights, bathrooms, neighbourhood_categorical, prop_type_simplified, room_type)

  model.matrix(~0+., data=reduced_listings_2) %>% 
  cor(use="everything") %>% 
  ggcorrplot(show.diag = F, type="lower", lab=TRUE, lab_size=4)

Some features of the categorical variables seem to be correlated. Entire rental units often feature 1 bathroom, so do entire service apartments. Private rooms in a rental unit often have only a shared bathroom, which is what you would expect. Apartments in the area Yau Tsim Mong tend to have one private bathroom. The other results do not tell us much. Ignore all correlation coefficients for mutually exclusive variables. Obviously, the property type private room in rental unit is heavily correlated with the room type private room

3.3.9 Correlation 4/4 - Numerical and categorical variables

# Show correlation among categorical variables
reduced_listings_3 <- reduced_listings %>% 
  select(log_price_4_nights, bathrooms, neighbourhood_categorical, prop_type_simplified, accommodates, host_acceptance_rate, number_of_reviews, bathrooms, bedrooms)

  model.matrix(~0+., data=reduced_listings_3) %>% 
  cor(use="everything") %>% 
  ggcorrplot(show.diag = F, type="lower", lab=TRUE, lab_size=4)

Across numerical and categorical variables, there is limited correlation. Accomodates (numerical) and 1 shared bathroom (categorical) exhibit slightly negative correlation. On the other hand, the correlation with 2 bathrooms is positive. This is what you would expect. The larger the number of accomodates, the more bathrooms there are. Accomodates is also negatively correlated to private rooms and positively correlated to entire rental units. Again, the same argument about size applies. In terms of geographical location, the Islands neighbourhood shows the highest correlation with accomodates. We expect the islands to be among the most expensive and largests domiciles and hence with the largest number of accomodates. The neighbourhood Wan Chai shows the highest negative correlation with number of reviews. This is perhaps a sign that Wan Chai is one of the most heavily populated areas with more listings than in demand.

3.3.10 Histograms and density plots for visualisations of selected variables

# Histogram for host response rate
ggplot(reduced_listings, aes(x = host_response_rate)) +
  geom_histogram(color="white") +
  labs(title = "Host response rate distribution", subtitle = "Histogram",
       x = "Host response rate (%)",
       y = "Number of listings") +
  theme_economist()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
  NULL

# Histogram for host acceptance rate
ggplot(reduced_listings, aes(x = host_acceptance_rate)) +
  geom_histogram(color="white") +
  labs(title = "Host acceptance rate distribution", subtitle = "Histogram",
       x = "Host acceptance rate (%)",
       y = "Number of listings") +
  theme_economist()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
  NULL

# Histogram for accommodates
ggplot(reduced_listings, aes(x = accommodates)) +
  geom_histogram(color="white") +
  labs(title = "Accommodates distribution", subtitle = "Histogram",
       x = "Number of person that can be accommodated",
       y = "Number of listings") +
  theme_economist()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
  NULL

# Density plot for beds distribution
ggplot(reduced_listings, aes(x = beds)) +
  geom_density()+
  labs(title = "Beds distribution", subtitle = "Density",
       x = "Number of beds",
       y = "Number of listings") +
  theme_economist()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
  NULL

# Density plot for availability_30
ggplot(reduced_listings, aes(x = availability_30)) +
  geom_density()+
  labs(title = "Availability over next 30 days distribution", subtitle = "Density",
       x = "Availability over next 30 days (in days)",
       y = "Number of listings") +
  theme_economist()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
  NULL

# Barchart for host response time
reduced_listings %>% 
  filter(host_response_time!="NA") %>% 
ggplot(aes(x = host_response_time))+ 
  geom_bar()+
  labs(title = "Host response time distribution", subtitle = "Barchart",
       x = "Host response time",
       y = "Number of listings") +
  theme_economist()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
  NULL

# Barchart for availability_30
reduced_listings %>% 
  filter(host_is_superhost!="NA") %>% 
ggplot(aes(x = host_is_superhost))+ 
  geom_bar()+
  labs(title = "Superhost distribution", subtitle = "Barchart",
       x = "Is the host a superhost?",
       y = "Number of listings") +
  theme_economist()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
  NULL

Here, we used some charts to visualise some important values, we expect to have a great influence on the price. It also allows us to identify outliers (e.g. host response rate of 0%, number of accomodates > 16).

3.4 Data wrangling

3.4.1 Data filtering

Given the fact that we are building a model for four nights, it is important to get a better understanding of the distribution of the minimum nights

# Table with minimum nights
reduced_listings %>%
count(minimum_nights, name="count", sort=TRUE) %>%
  rename("Minimum nights" = minimum_nights, "Frequency"="count") %>% 
   kable(align = "ll") %>% 
  kable_classic()
Minimum nights Frequency
1 2212
29 962
30 636
28 595
2 425
3 243
7 194
31 124
5 104
60 72
14 69
90 61
10 46
4 41
180 34
27 27
15 25
6 24
20 23
21 20
25 19
26 16
365 11
8 10
22 7
100 5
150 5
45 4
32 3
56 3
9 2
12 2
13 2
39 2
61 2
120 2
19 1
35 1
50 1
62 1
70 1
80 1
89 1
94 1
99 1
130 1
250 1
300 1
360 1
1125 1

The most common number of minimum nights is 1, followed by 29, 30, 28, 2, 3 and 7. Out of this list, the numbers 29, 30 and 28 clearly stand out. These “long-term” Airbnb bookings are different from a regular short-term booking, and are very likely to be a lease or sublet.

Since observations with a minimum of more than 4 nights would not be feasible for a model that predicts the price for four nights, these have been filtered out. As mentioned in the EDA, observations with a price of 0 and accommodates of 0 will be eliminated. Although one can argue that a maximum_nights of at least 4 nights could be added as a criteria, we have argued against it as someone can make a new booking with the new listing to prolong his or her stay

#Filtering the dataframe for minimum nights and faulty observations
reduced_listings <- reduced_listings %>% 
  filter (minimum_nights <= "4", price != "0", accommodates != 0)

4 Mapping

Below, you can find a map which shows all Airbnbs where the minimum nights is less than or equal to four nights in Hong Kong. Darker colors indicate more expensive airbnbs.

pal <- colorNumeric(palette = "BuPu", domain = reduced_listings$log_price_4_nights)
leaflet(data = filter(listings, minimum_nights <= 4)) %>% 
  addProviderTiles("OpenStreetMap.Mapnik") %>% 
  addCircleMarkers(lng = ~longitude, 
                   lat = ~latitude, 
                   radius = 1, 
                   fillColor = ~pal(log(price)), 
                   fillOpacity = 0.4, 
                   popup = ~listing_url,
                   label = ~property_type,
                   color = ~pal(log(price)))

5 Regression Analysis

5.0.1 Model 1

model1 <- lm(price_4_nights ~factor(prop_type_simplified) + number_of_reviews + review_scores_rating, data=reduced_listings)

msummary(model1)
                                                        Estimate Std. Error
(Intercept)                                             3609.492   1202.609
factor(prop_type_simplified)Entire rental unit          1301.983    893.123
factor(prop_type_simplified)Entire serviced apartment    394.116   1530.659
factor(prop_type_simplified)Other                       -242.214    876.853
factor(prop_type_simplified)Private room in rental unit -836.183    898.410
number_of_reviews                                         -6.880      3.217
review_scores_rating                                     110.317    200.145
                                                        t value Pr(>|t|)   
(Intercept)                                               3.001  0.00271 **
factor(prop_type_simplified)Entire rental unit            1.458  0.14502   
factor(prop_type_simplified)Entire serviced apartment     0.257  0.79683   
factor(prop_type_simplified)Other                        -0.276  0.78239   
factor(prop_type_simplified)Private room in rental unit  -0.931  0.35207   
number_of_reviews                                        -2.139  0.03255 * 
review_scores_rating                                      0.551  0.58155   

Residual standard error: 9566 on 2703 degrees of freedom
  (2852 observations deleted due to missingness)
Multiple R-squared:  0.00887,   Adjusted R-squared:  0.00667 
F-statistic: 4.032 on 6 and 2703 DF,  p-value: 0.0005005

The coefficient of review_scores_rating is -6.398. This means that the price per 4 nights will decrease by 6.398 USD if one extra review is written about the listing. The coefficient is significant at p=0.05.

The variable prop_type_simplified is divided among five categories (Private room in rental unit, Entire rental unit, Entire condominium (condo), Entire serviced apartment and other). In model 1, entire condominium (condo) is chosen as the baseline variable. The coefficient of each property type, can be interpreted as the USD change if the property type changes from an entire condominium (condo) to the property type of the coefficient. Given the fact that none of the dummy variables for property types is significant, the property type does not appear to be a predictor of airbnb prices per 4 nights in this model

5.0.2 Model 2

Next, we consider the type of room that was listed

model2 <- lm(price_4_nights ~factor(prop_type_simplified) +number_of_reviews + review_scores_rating + factor(room_type), data=reduced_listings)

msummary(model2)
                                                         Estimate Std. Error
(Intercept)                                              3603.502   1202.796
factor(prop_type_simplified)Entire rental unit           1300.857    891.919
factor(prop_type_simplified)Entire serviced apartment     394.638   1528.577
factor(prop_type_simplified)Other                        1599.089   1140.863
factor(prop_type_simplified)Private room in rental unit   929.918   1253.650
number_of_reviews                                          -6.820      3.223
review_scores_rating                                      111.332    200.505
factor(room_type)Hotel room                             -2858.442   1316.354
factor(room_type)Private room                           -1766.373    878.149
factor(room_type)Shared room                            -3400.340   1112.623
                                                        t value Pr(>|t|)   
(Intercept)                                               2.996  0.00276 **
factor(prop_type_simplified)Entire rental unit            1.458  0.14482   
factor(prop_type_simplified)Entire serviced apartment     0.258  0.79629   
factor(prop_type_simplified)Other                         1.402  0.16114   
factor(prop_type_simplified)Private room in rental unit   0.742  0.45829   
number_of_reviews                                        -2.116  0.03442 * 
review_scores_rating                                      0.555  0.57876   
factor(room_type)Hotel room                              -2.171  0.02998 * 
factor(room_type)Private room                            -2.011  0.04437 * 
factor(room_type)Shared room                             -3.056  0.00226 **

Residual standard error: 9553 on 2700 degrees of freedom
  (2852 observations deleted due to missingness)
Multiple R-squared:  0.01266,   Adjusted R-squared:  0.009372 
F-statistic: 3.848 on 9 and 2700 DF,  p-value: 7.437e-05

The base level is entire home/apartment, and the remaining categories are hotel room, private room, and shared room. Choosing a hotel room or private room is significant at the 5% level, and choosing a shared room is significant at the 1% level. The way to interpret these is that the price per 4 nights will decrease by USD 2858 for a hotel room compared to an entire home apartment in a condo. Similarly, the price for a private room is USD 1766 less compared to an entire home/apartment, and USD 3400 less for a shared room, which follows logic that shared rooms would be the cheapest.

For the remainder of this assignment, we will use the log_price_4_nights variable instead of price_4_nights. Since prop_type_simplified and number_of_reviews were insignificant, we will remove them from the next model. First, we will check whether the number of bathrooms, bedrooms, beds, or size of the house (accomodates) are significant predictors. We will also check whether these are co-linear variables. The reasoning for this is that in the correlation matrix, these seemed to have a significant correlation with the dependent variable of price.

5.0.3 Model 3

# New model
model3 <- lm(log_price_4_nights ~ number_of_reviews + factor(room_type) + bathrooms + bedrooms + beds + accommodates, data=reduced_listings)
msummary(model3)
                                Estimate Std. Error t value Pr(>|t|)    
(Intercept)                    7.5787487  0.0324557 233.511  < 2e-16 ***
number_of_reviews             -0.0002478  0.0002430  -1.019    0.308    
factor(room_type)Hotel room   -0.9904779  0.0829418 -11.942  < 2e-16 ***
factor(room_type)Private room -0.8739274  0.0312877 -27.932  < 2e-16 ***
factor(room_type)Shared room  -1.1078376  0.0565321 -19.597  < 2e-16 ***
bathrooms1 private bath        0.5777998  0.0348096  16.599  < 2e-16 ***
bathrooms1 shared bath        -0.0340561  0.0353094  -0.965    0.335    
bathrooms2 baths               0.2920691  0.0531295   5.497 4.07e-08 ***
bathroomsOther                -0.0359973  0.0387997  -0.928    0.354    
bedrooms                       0.1291012  0.0178676   7.225 5.86e-13 ***
beds                           0.0099620  0.0101364   0.983    0.326    
accommodates                   0.0728291  0.0067308  10.820  < 2e-16 ***

Residual standard error: 0.7338 on 4404 degrees of freedom
  (1146 observations deleted due to missingness)
Multiple R-squared:  0.3832,    Adjusted R-squared:  0.3817 
F-statistic: 248.7 on 11 and 4404 DF,  p-value: < 2.2e-16
# Check for co-linearity
car::vif(model3)
                      GVIF Df GVIF^(1/(2*Df))
number_of_reviews 1.044338  1        1.021929
factor(room_type) 2.285178  3        1.147678
bathrooms         2.161502  4        1.101145
bedrooms          1.767274  1        1.329389
beds              2.273932  1        1.507956
accommodates      2.344080  1        1.531039

It appears that the bathrooms, bedrooms and accommodates are significant variables. Surprisingly, there is no co linearity between these variables (since the VIF is lower than 5). Also, adopting a log price model helps increase the explanatory power of the model to 38%. It also increases the significance of the variables. Given that the dependent variable is log transformed, the interpretation of coefficients changes. As before, the room type affects the price per 4 nights and having a shared room decreases the price. The new independent variables controls for the number of baths, beds, and the people that can be accommodated. There are a few noteworthy points

  1. Beds and accommodates are not correlated in the model, even though generally the number of beds should be related to the max people that can be accommodated
  2. For an accommodation, in an entire room/apartment, in a condo property type, having 2 baths increases the price per 4 nights by 33%
  3. For every extra bedroom in an entire room/apartment in a condo, the price per 4 nights increases by 12% (We get this by (exp(.12) -1)*100 )
  4. For every additional person that the listing can accommodate, the price per 4 nights increases by 7.4%

5.0.4 Model 4

After removing the insignificant variables (number_of_reviews and beds), we will check whether superhosts (host_is_superhost) command a pricing premium

# New model
model4 <- lm(log_price_4_nights ~ factor(room_type) + bathrooms + bedrooms + accommodates +factor(host_is_superhost), data=reduced_listings)
msummary(model4)
                               Estimate Std. Error t value Pr(>|t|)    
(Intercept)                    7.626631   0.031686 240.695  < 2e-16 ***
factor(room_type)Hotel room   -0.984526   0.081533 -12.075  < 2e-16 ***
factor(room_type)Private room -0.820064   0.031032 -26.426  < 2e-16 ***
factor(room_type)Shared room  -1.083387   0.052978 -20.450  < 2e-16 ***
bathrooms1 private bath        0.499003   0.034108  14.630  < 2e-16 ***
bathrooms1 shared bath        -0.042505   0.034407  -1.235    0.217    
bathrooms2 baths               0.324447   0.052277   6.206 5.91e-10 ***
bathroomsOther                -0.039004   0.038025  -1.026    0.305    
bedrooms                       0.125056   0.016978   7.366 2.08e-13 ***
accommodates                   0.073037   0.005579  13.090  < 2e-16 ***
factor(host_is_superhost)TRUE -0.238758   0.026637  -8.963  < 2e-16 ***

Residual standard error: 0.7245 on 4517 degrees of freedom
  (1034 observations deleted due to missingness)
Multiple R-squared:  0.3891,    Adjusted R-squared:  0.3877 
F-statistic: 287.6 on 10 and 4517 DF,  p-value: < 2.2e-16
# Check for co-linearity
car::vif(model4)
                              GVIF Df GVIF^(1/(2*Df))
factor(room_type)         2.126925  3        1.134032
bathrooms                 2.165278  4        1.101385
bedrooms                  1.675540  1        1.294427
accommodates              1.672316  1        1.293180
factor(host_is_superhost) 1.082233  1        1.040304

The host_is_superhost variable is significant at p=0.01 and the other independent variable types are significant, like before. Remarkably, it appears that the log price per 4 nights decreases when a host is a superhost. Specifically, when the host is a superhost, the price per 4 nights decreases by 26%.

5.0.5 Model 5

Some hosts allow you to immediately book their listing (instant_bookable == TRUE), while a non-trivial proportion don’t. After controlling for other variables, we will check whether instant_bookable a significant predictor

# New model
model5 <- lm(log_price_4_nights ~ factor(room_type) + bathrooms + bedrooms + accommodates +factor(host_is_superhost) + factor(instant_bookable), data=reduced_listings)
msummary(model5)
                               Estimate Std. Error t value Pr(>|t|)    
(Intercept)                    7.562731   0.031758 238.136  < 2e-16 ***
factor(room_type)Hotel room   -1.070046   0.080768 -13.248  < 2e-16 ***
factor(room_type)Private room -0.803659   0.030640 -26.229  < 2e-16 ***
factor(room_type)Shared room  -1.105606   0.052287 -21.145  < 2e-16 ***
bathrooms1 private bath        0.437309   0.034079  12.832  < 2e-16 ***
bathrooms1 shared bath        -0.023933   0.033974  -0.704    0.481    
bathrooms2 baths               0.343148   0.051585   6.652 3.23e-11 ***
bathroomsOther                -0.042205   0.037504  -1.125    0.260    
bedrooms                       0.121732   0.016748   7.269 4.26e-13 ***
accommodates                   0.067756   0.005523  12.269  < 2e-16 ***
factor(host_is_superhost)TRUE -0.215427   0.026352  -8.175 3.81e-16 ***
factor(instant_bookable)TRUE   0.279986   0.024776  11.301  < 2e-16 ***

Residual standard error: 0.7145 on 4516 degrees of freedom
  (1034 observations deleted due to missingness)
Multiple R-squared:  0.4059,    Adjusted R-squared:  0.4044 
F-statistic: 280.4 on 11 and 4516 DF,  p-value: < 2.2e-16
# Check for co-linearity
car::vif(model5)
                              GVIF Df GVIF^(1/(2*Df))
factor(room_type)         2.168170  3        1.137668
bathrooms                 2.268737  4        1.107830
bedrooms                  1.676057  1        1.294626
accommodates              1.684376  1        1.297835
factor(host_is_superhost) 1.088916  1        1.043511
factor(instant_bookable)  1.105441  1        1.051399

The instant_bookable variable is significant at p=0.01. It appears that a price premium is paid if a listing is instantly bookable. This follows, as the convenience of being able to book instantly would push people to be willing to spend higher. The model also now explains 40% of the data. The way to interpret the coefficient on a property being bookable is that for an entire apartment/house increases the price by 31% as opposed to a property that is not instantly bookable.

5.0.6 Model 6

Next, we will inspect the impact of the neighborhood. For this, we have the listings in Hong Kong into five areas. Moreover, we will investigate important amenities, host identity verification, host acceptance rate, host listings and number of reviews

# New model
model6 <- lm(log_price_4_nights ~ factor(room_type) + bathrooms + bedrooms + accommodates+ factor(instant_bookable) + neighbourhood_categorical + important_amenities + factor(host_identity_verified)  + host_acceptance_rate + calculated_host_listings_count + number_of_reviews, data=reduced_listings)
msummary(model6)
                                         Estimate Std. Error t value Pr(>|t|)
(Intercept)                             7.3892700  0.0871529  84.785  < 2e-16
factor(room_type)Hotel room            -0.9328048  0.0850231 -10.971  < 2e-16
factor(room_type)Private room          -0.6486390  0.0355496 -18.246  < 2e-16
factor(room_type)Shared room           -0.8817947  0.0576430 -15.298  < 2e-16
bathrooms1 private bath                 0.3832317  0.0393760   9.733  < 2e-16
bathrooms1 shared bath                  0.0261429  0.0338742   0.772 0.440309
bathrooms2 baths                        0.4057886  0.0512915   7.911 3.44e-15
bathroomsOther                         -0.0730408  0.0393916  -1.854 0.063796
bedrooms                                0.1053971  0.0178175   5.915 3.65e-09
accommodates                            0.0536096  0.0061368   8.736  < 2e-16
factor(instant_bookable)TRUE            0.2092476  0.0314237   6.659 3.22e-11
neighbourhood_categoricalIslands        0.2055465  0.0583848   3.521 0.000436
neighbourhood_categoricalOther         -0.1148559  0.0385546  -2.979 0.002912
neighbourhood_categoricalWan Chai      -0.1447215  0.0358287  -4.039 5.48e-05
neighbourhood_categoricalYau Tsim Mong -0.1547758  0.0366908  -4.218 2.53e-05
important_amenities                     0.1328478  0.0232778   5.707 1.25e-08
factor(host_identity_verified)TRUE      0.1684831  0.0272263   6.188 6.83e-10
host_acceptance_rate                   -0.0018740  0.0004422  -4.238 2.32e-05
calculated_host_listings_count         -0.0011920  0.0001306  -9.128  < 2e-16
number_of_reviews                      -0.0010516  0.0002336  -4.502 6.95e-06
                                          
(Intercept)                            ***
factor(room_type)Hotel room            ***
factor(room_type)Private room          ***
factor(room_type)Shared room           ***
bathrooms1 private bath                ***
bathrooms1 shared bath                    
bathrooms2 baths                       ***
bathroomsOther                         .  
bedrooms                               ***
accommodates                           ***
factor(instant_bookable)TRUE           ***
neighbourhood_categoricalIslands       ***
neighbourhood_categoricalOther         ** 
neighbourhood_categoricalWan Chai      ***
neighbourhood_categoricalYau Tsim Mong ***
important_amenities                    ***
factor(host_identity_verified)TRUE     ***
host_acceptance_rate                   ***
calculated_host_listings_count         ***
number_of_reviews                      ***

Residual standard error: 0.622 on 3321 degrees of freedom
  (2221 observations deleted due to missingness)
Multiple R-squared:  0.5395,    Adjusted R-squared:  0.5369 
F-statistic: 204.8 on 19 and 3321 DF,  p-value: < 2.2e-16
# Check for co-linearity
car::vif(model6)
                                   GVIF Df GVIF^(1/(2*Df))
factor(room_type)              2.926549  3        1.195986
bathrooms                      3.108303  4        1.152300
bedrooms                       1.807903  1        1.344583
accommodates                   1.976054  1        1.405722
factor(instant_bookable)       1.577734  1        1.256079
neighbourhood_categorical      1.758945  4        1.073140
important_amenities            1.730228  1        1.315381
factor(host_identity_verified) 1.595662  1        1.263195
host_acceptance_rate           1.446151  1        1.202560
calculated_host_listings_count 3.448548  1        1.857027
number_of_reviews              1.173653  1        1.083353

Overall, the neighborhood has a significant impact on the price per 4 nights. The new model, which controls for whether or not the property has important amenities, the number of host listings, and the type of neighborhood, explains 53% of the variability in data. The new independent variables are all significant at the 1% level. The base neighborhood is Central & Western and the interpretation on the coefficients is:

  1. For a property that has important amenities, the definition of which has been given above, the price per 4 nights increases by 14%
  2. If a property has an additional review, strangely, the price per 4 nights decreases by .10%. This could be explained by the fact that an additional review doesn’t mean it is a positive review. An additional review could bring down the rating, which could bring down the price.
  3. If the host acceptance rate increases by 1%, the price per 4 nights of an entire house/apartment in Central and Western decreases by .18%
  4. If the number of listings a host has increase by 1, the price per 4 nights decreases by .12%
  5. Living in an entire house/apartment in any neighborhood, except for the Islands, will decrease the price for 4 nights by 13%. Living in the Islands increases the price for 4 nights by 22%, meaning that Central and Western is the 2nd most expensive neighborhood

5.0.7 Model 7

Next, we will incorporate the effect of avalability_30 or reviews_per_month on log_price_4_nights

# New model
model7 <- lm(log_price_4_nights ~ factor(room_type) + bathrooms + bedrooms + accommodates +factor(host_is_superhost) + factor(instant_bookable) + neighbourhood_categorical + important_amenities + factor(host_identity_verified)  + host_acceptance_rate + calculated_host_listings_count + availability_30 + reviews_per_month, data=reduced_listings)
msummary(model7)
                                         Estimate Std. Error t value Pr(>|t|)
(Intercept)                             7.2867542  0.1511163  48.220  < 2e-16
factor(room_type)Hotel room            -1.0009688  0.1130183  -8.857  < 2e-16
factor(room_type)Private room          -0.6131677  0.0614492  -9.978  < 2e-16
factor(room_type)Shared room           -1.4889174  0.1091533 -13.641  < 2e-16
bathrooms1 private bath                 0.4475541  0.0613657   7.293 4.83e-13
bathrooms1 shared bath                  0.0141115  0.0748819   0.188 0.850549
bathrooms2 baths                        0.4001954  0.0845549   4.733 2.42e-06
bathroomsOther                         -0.0377981  0.0628356  -0.602 0.547569
bedrooms                                0.0836681  0.0291892   2.866 0.004208
accommodates                            0.0772091  0.0098076   7.872 6.53e-15
factor(host_is_superhost)TRUE          -0.0043928  0.0459171  -0.096 0.923796
factor(instant_bookable)TRUE            0.2733612  0.0421635   6.483 1.21e-10
neighbourhood_categoricalIslands        0.2006377  0.0859812   2.334 0.019750
neighbourhood_categoricalOther         -0.2016518  0.0717878  -2.809 0.005033
neighbourhood_categoricalWan Chai      -0.1864330  0.0801457  -2.326 0.020139
neighbourhood_categoricalYau Tsim Mong -0.2867385  0.0677304  -4.234 2.44e-05
important_amenities                     0.1511906  0.0421489   3.587 0.000345
factor(host_identity_verified)TRUE      0.2692909  0.0430306   6.258 5.04e-10
host_acceptance_rate                   -0.0027050  0.0006604  -4.096 4.43e-05
calculated_host_listings_count         -0.0006289  0.0003103  -2.027 0.042855
availability_30                         0.0017441  0.0014903   1.170 0.242046
reviews_per_month                      -0.0276561  0.0115593  -2.393 0.016852
                                          
(Intercept)                            ***
factor(room_type)Hotel room            ***
factor(room_type)Private room          ***
factor(room_type)Shared room           ***
bathrooms1 private bath                ***
bathrooms1 shared bath                    
bathrooms2 baths                       ***
bathroomsOther                            
bedrooms                               ** 
accommodates                           ***
factor(host_is_superhost)TRUE             
factor(instant_bookable)TRUE           ***
neighbourhood_categoricalIslands       *  
neighbourhood_categoricalOther         ** 
neighbourhood_categoricalWan Chai      *  
neighbourhood_categoricalYau Tsim Mong ***
important_amenities                    ***
factor(host_identity_verified)TRUE     ***
host_acceptance_rate                   ***
calculated_host_listings_count         *  
availability_30                           
reviews_per_month                      *  

Residual standard error: 0.7091 on 1535 degrees of freedom
  (4005 observations deleted due to missingness)
Multiple R-squared:  0.4329,    Adjusted R-squared:  0.4251 
F-statistic: 55.79 on 21 and 1535 DF,  p-value: < 2.2e-16
# Check for co-linearity
car::vif(model7)
                                   GVIF Df GVIF^(1/(2*Df))
factor(room_type)              4.594357  3        1.289350
bathrooms                      3.939700  4        1.186951
bedrooms                       2.043575  1        1.429537
accommodates                   2.136886  1        1.461809
factor(host_is_superhost)      1.142213  1        1.068743
factor(instant_bookable)       1.324012  1        1.150657
neighbourhood_categorical      1.841446  4        1.079307
important_amenities            1.368281  1        1.169736
factor(host_identity_verified) 1.176545  1        1.084686
host_acceptance_rate           1.339702  1        1.157455
calculated_host_listings_count 1.692133  1        1.300820
availability_30                1.177546  1        1.085148
reviews_per_month              1.185278  1        1.088705

Overall, it appears that availability_30 is not a predictor of log prices, since it is insignificant. For the remainder of this project, we will use model 6 as our best model because of the adjusted R2 and significance of variables.

5.0.8 Residuals

To check the robustness of our established model, model6, we run a series of diagnostic tests.

autoplot(model6)

Since the QQ plot shows that the residuals don’t follow a normal distribution, we cannot use msummary to establish the interpretation of coefficients as we need a more robust estimator for our t-tests and confidence intervals. However, OLS is still the BLUE. To double check this, we run a Shapiro W Test and a Breusch - Pagan t test. The null hypothesis for the shapiro test is that the residuals follow a normal distribution and the null for the Breusch Pagan is that the data is homoskedastic.

shapiro.test(model6$residuals)

    Shapiro-Wilk normality test

data:  model6$residuals
W = 0.83712, p-value < 2.2e-16
bptest(model6)

    studentized Breusch-Pagan test

data:  model6
BP = 231.07, df = 19, p-value < 2.2e-16

We reject the null hypothesis of normality at the 1% level and reject the null hypothesis of homoskedasticity at the 1% level . Thus, to get robust estimates of the coefficients, we use a robust t test. We selected the HC0 type as the other types are more effective for small samples.

coeftest(model6, vcov = vcovHC(model6, type = "HC0"))

t test of coefficients:

                                          Estimate  Std. Error  t value
(Intercept)                             7.38927000  0.08760343  84.3491
factor(room_type)Hotel room            -0.93280477  0.08716610 -10.7015
factor(room_type)Private room          -0.64863901  0.03754449 -17.2765
factor(room_type)Shared room           -0.88179474  0.06682907 -13.1948
bathrooms1 private bath                 0.38323174  0.04362637   8.7844
bathrooms1 shared bath                  0.02614289  0.02371966   1.1022
bathrooms2 baths                        0.40578862  0.05211780   7.7860
bathroomsOther                         -0.07304076  0.04174750  -1.7496
bedrooms                                0.10539706  0.03106102   3.3932
accommodates                            0.05360962  0.00771786   6.9462
factor(instant_bookable)TRUE            0.20924761  0.04132860   5.0630
neighbourhood_categoricalIslands        0.20554654  0.04890183   4.2032
neighbourhood_categoricalOther         -0.11485588  0.03481534  -3.2990
neighbourhood_categoricalWan Chai      -0.14472146  0.02500214  -5.7884
neighbourhood_categoricalYau Tsim Mong -0.15477582  0.03073294  -5.0362
important_amenities                     0.13284784  0.02104873   6.3114
factor(host_identity_verified)TRUE      0.16848315  0.02910709   5.7884
host_acceptance_rate                   -0.00187403  0.00043929  -4.2660
calculated_host_listings_count         -0.00119198  0.00013456  -8.8582
number_of_reviews                      -0.00105163  0.00022631  -4.6469
                                        Pr(>|t|)    
(Intercept)                            < 2.2e-16 ***
factor(room_type)Hotel room            < 2.2e-16 ***
factor(room_type)Private room          < 2.2e-16 ***
factor(room_type)Shared room           < 2.2e-16 ***
bathrooms1 private bath                < 2.2e-16 ***
bathrooms1 shared bath                 0.2704715    
bathrooms2 baths                       9.173e-15 ***
bathroomsOther                         0.0802825 .  
bedrooms                               0.0006988 ***
accommodates                           4.497e-12 ***
factor(instant_bookable)TRUE           4.351e-07 ***
neighbourhood_categoricalIslands       2.700e-05 ***
neighbourhood_categoricalOther         0.0009805 ***
neighbourhood_categoricalWan Chai      7.768e-09 ***
neighbourhood_categoricalYau Tsim Mong 5.003e-07 ***
important_amenities                    3.131e-10 ***
factor(host_identity_verified)TRUE     7.767e-09 ***
host_acceptance_rate                   2.045e-05 ***
calculated_host_listings_count         < 2.2e-16 ***
number_of_reviews                      3.500e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

5.0.9 Correcting for heteroskedasticity

To correct for the heteroskedasticity, we use a WLS adjustment on model 6

# heteroskedasticity is present, use WLS 
wt <- 1/lm(abs(model6$residuals)~model6$fitted.values, na.action = "na.exclude")$fitted.values^2
wls_model_6 <-  lm(log_price_4_nights ~ factor(room_type) + bathrooms + bedrooms + accommodates +factor(host_is_superhost) + factor(instant_bookable) + neighbourhood_categorical, data=reduced_listings)

skim(reduced_listings$log_price_4_nights)
Data summary
Name reduced_listings$log_pric…
Number of rows 5562
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 7.46 0.89 1.39 6.67 7.33 7.92 12.73 <U+2581><U+2581><U+2587><U+2582><U+2581>
which(is.na(reduced_listings$log_price_4_nights))
integer(0)
skim(model6$residuals)
Data summary
Name model6$residuals
Number of rows 3341
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 0 0.62 -2.21 -0.33 -0.06 0.19 5.61 <U+2581><U+2587><U+2581><U+2581><U+2581>

The QQplot shows that the non-normality has been corrected, although there is still the presence of large outliers with high leverage.

autoplot(wls_model_6)

#Interaction between variables
coplot(log_price_4_nights ~ accommodates | neighbourhood_categorical * important_amenities,data = reduced_listings)

We can see from the co-plot the varying distributions across different neighbourhoods. We can see that across the plots that Yau Tsim Mong tends to have more high accommodation airbnbs. We also see a spike in higher price airbnbs in the Other and Islands neighbourhood. For the amenities, we see that there are generally more high amenity airbnbs across the neighbourhoods and an increase in number of larger places as amenities increases.

5.0.10 Model comparison

Now, to compare all the models we have

huxreg(list("Model 1" = model1,"Model 2" =  model2,"Model 3" =  model3,"Model 4" =  model4,"Model 5" =  model5, "Model 6" = model6, "Model 7" = model7),
             statistics = c('R squared' = 'r.squared', 
                            'Adj. R Squared' = 'adj.r.squared', 
                            'Residual SE' = 'sigma'), 
                 bold_signif = 0.05
) %>% 
  set_caption('Comparison of models')

Comparison of models
Model 1Model 2Model 3Model 4Model 5Model 6Model 7
(Intercept)3609.492 **3603.502 **7.579 ***7.627 ***7.563 ***7.389 ***7.287 ***
(1202.609)  (1202.796)  (0.032)   (0.032)   (0.032)   (0.087)   (0.151)   
factor(prop_type_simplified)Entire rental unit1301.983   1300.857                                           
(893.123)  (891.919)                                          
factor(prop_type_simplified)Entire serviced apartment394.116   394.638                                           
(1530.659)  (1528.577)                                          
factor(prop_type_simplified)Other-242.214   1599.089                                           
(876.853)  (1140.863)                                          
factor(prop_type_simplified)Private room in rental unit-836.183   929.918                                           
(898.410)  (1253.650)                                          
number_of_reviews-6.880 * -6.820 * -0.000                    -0.001 ***        
(3.217)  (3.223)  (0.000)                   (0.000)           
review_scores_rating110.317   111.332                                           
(200.145)  (200.505)                                          
factor(room_type)Hotel room       -2858.442 * -0.990 ***-0.985 ***-1.070 ***-0.933 ***-1.001 ***
       (1316.354)  (0.083)   (0.082)   (0.081)   (0.085)   (0.113)   
factor(room_type)Private room       -1766.373 * -0.874 ***-0.820 ***-0.804 ***-0.649 ***-0.613 ***
       (878.149)  (0.031)   (0.031)   (0.031)   (0.036)   (0.061)   
factor(room_type)Shared room       -3400.340 **-1.108 ***-1.083 ***-1.106 ***-0.882 ***-1.489 ***
       (1112.623)  (0.057)   (0.053)   (0.052)   (0.058)   (0.109)   
bathrooms1 private bath              0.578 ***0.499 ***0.437 ***0.383 ***0.448 ***
              (0.035)   (0.034)   (0.034)   (0.039)   (0.061)   
bathrooms1 shared bath              -0.034    -0.043    -0.024    0.026    0.014    
              (0.035)   (0.034)   (0.034)   (0.034)   (0.075)   
bathrooms2 baths              0.292 ***0.324 ***0.343 ***0.406 ***0.400 ***
              (0.053)   (0.052)   (0.052)   (0.051)   (0.085)   
bathroomsOther              -0.036    -0.039    -0.042    -0.073    -0.038    
              (0.039)   (0.038)   (0.038)   (0.039)   (0.063)   
bedrooms              0.129 ***0.125 ***0.122 ***0.105 ***0.084 ** 
              (0.018)   (0.017)   (0.017)   (0.018)   (0.029)   
beds              0.010                                    
              (0.010)                                   
accommodates              0.073 ***0.073 ***0.068 ***0.054 ***0.077 ***
              (0.007)   (0.006)   (0.006)   (0.006)   (0.010)   
factor(host_is_superhost)TRUE                      -0.239 ***-0.215 ***        -0.004    
                      (0.027)   (0.026)           (0.046)   
factor(instant_bookable)TRUE                              0.280 ***0.209 ***0.273 ***
                              (0.025)   (0.031)   (0.042)   
neighbourhood_categoricalIslands                                      0.206 ***0.201 *  
                                      (0.058)   (0.086)   
neighbourhood_categoricalOther                                      -0.115 ** -0.202 ** 
                                      (0.039)   (0.072)   
neighbourhood_categoricalWan Chai                                      -0.145 ***-0.186 *  
                                      (0.036)   (0.080)   
neighbourhood_categoricalYau Tsim Mong                                      -0.155 ***-0.287 ***
                                      (0.037)   (0.068)   
important_amenities                                      0.133 ***0.151 ***
                                      (0.023)   (0.042)   
factor(host_identity_verified)TRUE                                      0.168 ***0.269 ***
                                      (0.027)   (0.043)   
host_acceptance_rate                                      -0.002 ***-0.003 ***
                                      (0.000)   (0.001)   
calculated_host_listings_count                                      -0.001 ***-0.001 *  
                                      (0.000)   (0.000)   
availability_30                                              0.002    
                                              (0.001)   
reviews_per_month                                              -0.028 *  
                                              (0.012)   
R squared0.009   0.013   0.383    0.389    0.406    0.539    0.433    
Adj. R Squared0.007   0.009   0.382    0.388    0.404    0.537    0.425    
Residual SE9565.743   9552.724   0.734    0.724    0.715    0.622    0.709    
*** p < 0.001; ** p < 0.01; * p < 0.05.
### Testing the model

#Testing the model
set.seed(123)
train_test_split <- initial_split(reduced_listings, prop = 0.75)
listings_train <- training(train_test_split)
listings_test <- testing(train_test_split)

rmse_train <- listings_train %>%
  mutate(predictions = predict(model6, .)) %>%
  summarise(sqrt(sum(predictions - log_price_4_nights, na.rm = TRUE) ^ 2/n())) %>%
  pull()
rmse_train
[1] 0.06453211
rmse_test <- listings_test %>%
  mutate(predictions = predict(model6, .)) %>%
  summarise(sqrt(sum(predictions - log_price_4_nights, na.rm = TRUE) ^ 2/n())) %>%
  pull()
rmse_test
[1] 0.1117461

Given that the RMSE for the training and testing model have a small margin of error, it proves that model6 isn’t overfitting the data

6 Prediction

unique(reduced_listings$bathrooms)
[1] "1 bath"         "Other"          "2 baths"        "1 private bath"
[5] "1 shared bath" 
testingairbnb = data.frame(room_type = c("Private room","Private room","Private room","Private room","Private room"),bathrooms = c("1 private bath","Other","1 private bath","1 private bath","2 baths"), bedrooms = c(1,2,1,1,2), accommodates = c(2,6,2,2,5), instant_bookable = c(TRUE,FALSE,TRUE,TRUE,TRUE),important_amenities = c(3,2,3,3,3), host_identity_verified=c(TRUE,TRUE,TRUE,TRUE,TRUE),calculated_host_listings_count = c(5,5,3,28,1), number_of_reviews = c(100,31,10,40,21),host_acceptance_rate = c(100,50,100,25,25), neighbourhood_categorical = c("Yau Tsim Mong","Islands","Wan Chai","Other","Other"))
exp(predict(model6, newdata = testingairbnb, interval = "confidence"))/4
       fit      lwr       upr
1 530.1486 495.9108  566.7502
2 556.2463 490.1354  631.2744
3 590.0706 542.3562  641.9826
4 658.0704 591.5738  732.0417
5 925.4304 803.6984 1065.6005

We can use the above data to predict the price for 4 nights based on different parameters. The cheapest type of room would be a private room, with a private bath, one bedroom, that accommodates 2, in Yau Tsim Mong. The property is instantly bookable, has 3 important amenities, with a verified host who has 5 listings, 100 reviews, and accepts bookings 100% of the time. The price per night for this type of property is USD530, which means the price for 4 nights is USD2120.

Conversely, the most expensive accommodation is a private room with 2 baths, 2 bedrooms, that accommodates 5, is instantly bookable with 3 important amenities located in a neighbourhood outside the most popular 3 (Yau Tsim Mong, Islands, Wan Chai). The host’s identity is verified, has 1 listing, has received 21 reviews, and accepts bookings 25% of the time. The price per night is USD 925, and the price per 4 nights is USD3700

When compared to the official prices as listed on AirBnB, the property prices per night are USD 514, 617, 411, 1554, and 1029 respectively. We can see that the prices for the properties are skewed towards the higher end of the CI, which could be explained by the heteroskedasticity of the data as well as the fact that COVID might have had an impact on the data used in the model, whereas latest prices have significantly changed since then.

In addition, as our model was trained with middle to high-end data, the only properties this has predictive power for is other middle to high-end properties as we can not extrapolate beyond the scope of our data. The 25th percentile of our prices is around 250 USD per night, so our predictive power decreases for low end airbnbs. The same is true with ultra-high end properties.

```

7 Acknowledgements